Load the data and check its structure (variables, missing values, data types)

library(tidyverse)  
── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)       
here() starts at C:/Users/Lenovo/Documents/RStudioProjects/UFO-Sightings---R-Project
library(withr)      
ufo_sightings <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/ufo_sightings.csv')
Rows: 96429 Columns: 12── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (7): city, state, country_code, shape, reported_duration, summary, day_part
dbl  (1): duration_seconds
lgl  (1): has_images
dttm (2): reported_date_time, reported_date_time_utc
date (1): posted_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
places <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/places.csv')
Rows: 14417 Columns: 10── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): city, alternate_city_names, state, country, country_code, timezone
dbl (4): latitude, longitude, population, elevation_m
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
day_parts_map <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/day_parts_map.csv')
Rows: 26409 Columns: 12── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl  (2): rounded_lat, rounded_long
date (1): rounded_date
time (9): astronomical_twilight_begin, nautical_twilight_begin, civil_twilight_begin, sunrise, solar_noon, sunset, civil_twilight_end, nauti...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Checking if files have been loaded correctly

# Preview the top rows of each table to confirm the downloads parsed.
ufo_sightings %>% head()
places %>% head()
day_parts_map %>% head()

Saving data in respective files

# Make sure the local data folder exists, then store a raw copy of each
# downloaded table in it.
raw_data_dir <- here("data", "2023", "2023-06-20")
dir.create(raw_data_dir, recursive = TRUE, showWarnings = FALSE)

write_csv(ufo_sightings, file.path(raw_data_dir, "ufo_sightings.csv"))
write_csv(places, file.path(raw_data_dir, "places.csv"))
write_csv(day_parts_map, file.path(raw_data_dir, "day_parts_map.csv"))

Checking missing data

glimpse(ufo_sightings)
Rows: 96,429
Columns: 12
$ reported_date_time     <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-…
$ reported_date_time_utc <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-…
$ posted_date            <date> 2022-09-09, 2022-10-08, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 20…
$ city                   <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "…
$ state                  <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM…
$ country_code           <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US…
$ shape                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ reported_duration      <chr> "15 mins\u0085", "1 minute", "2 hours", "30 seconds", "3 minutes", "10 minutes", "20 seconds", "5 minutes", "90…
$ duration_seconds       <dbl> 900, 60, 172800, 30, 180, 600, 20, 300, 120, 1800, 10, 3, 45, 60, 240, 32, 300, 600, 180, 1200, 45, 300, 180, 1…
$ summary                <chr> "Saw multi color object above horizon.", "An object in the shape of a straight line about an inch from our view…
$ has_images             <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
$ day_part               <chr> "night", "nautical dusk", "night", "afternoon", "night", "morning", "morning", "afternoon", NA, "astronomical d…
glimpse(places)
Rows: 14,417
Columns: 10
$ city                 <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "Gr…
$ alternate_city_names <chr> "Pajnkherst,bynhwrst,pynhwrst  karwlynay shmaly,Пајнхерст,بينهورست,پینهورست، کارولینای شمالی", NA, "CLE,Cleavelan…
$ state                <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM",…
$ country              <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "Australia", "USA", "USA", "USA", "USA", "USA", "India", …
$ country_code         <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US",…
$ latitude             <dbl> 35.19543, 44.83445, 41.49950, 39.16533, 33.66946, 35.33951, 37.65042, 41.11760, -33.53233, 44.48912, 44.05368, 33…
$ longitude            <dbl> -79.46948, -85.28256, -81.69541, -86.52639, -117.82311, -97.48670, -77.61249, -73.40790, 149.25367, -108.05621, -…
$ timezone             <chr> "America/New_York", "America/Detroit", "America/New_York", "America/Indiana/Indianapolis", "America/Los_Angeles",…
$ population           <dbl> 15752, 1352, 388072, 84067, 256927, 60451, 24729, 88485, 3355, 1879, 2349, 1608139, 71579, 7912, 197340, 559121, …
$ elevation_m          <dbl> 160, 192, 199, 235, 17, 382, 89, 11, NA, 1156, 160, 331, 1, 157, NA, 1511, 414, 2, 393, 89, 1373, 264, 1, 89, 220…
glimpse(day_parts_map)
Rows: 26,409
Columns: 12
$ rounded_lat                 <dbl> 40, 40, 40, 40, 30, 40, 40, 40, -30, 40, 40, 30, 30, 40, 30, 40, 30, 30, 40, 30, 40, 30, 40, 30, 50, 40, 2…
$ rounded_long                <dbl> -80, -90, -80, -90, -120, -100, -80, -70, 150, -110, -70, -110, -80, -120, 80, -110, -120, -80, -100, -120…
$ rounded_date                <date> 2022-08-28, 2022-08-21, 2022-08-14, 2022-08-07, 2022-08-07, 2022-07-24, 2022-07-17, 2022-07-17, 2022-07-1…
$ astronomical_twilight_begin <time> 09:07:43, 09:38:33, 08:48:54, 09:19:00, 11:55:02, 09:39:05, 08:09:39, 07:29:37, 19:30:59, 10:01:25, 07:21…
$ nautical_twilight_begin     <time> 09:42:49, 10:14:53, 09:26:40, 09:58:23, 12:26:50, 10:22:03, 08:54:27, 08:14:25, 20:00:01, 10:47:51, 08:07…
$ civil_twilight_begin        <time> 10:16:18, 10:49:12, 10:01:56, 10:34:41, 12:57:22, 11:00:33, 09:34:01, 08:54:00, 20:29:37, 11:28:21, 08:48…
$ sunrise                     <time> 10:42:53, 11:16:15, 10:29:32, 11:02:53, 13:21:39, 11:30:01, 10:04:06, 09:24:04, 20:54:22, 11:58:55, 09:18…
$ solar_noon                  <time> 17:21:10, 18:03:05, 17:24:39, 18:05:45, 20:05:45, 18:46:32, 17:26:12, 16:46:12, 02:05:20, 19:25:26, 16:45…
$ sunset                      <time> 23:59:27, 00:49:55, 00:19:45, 01:08:37, 02:49:51, 02:03:03, 00:48:18, 00:08:20, 07:16:18, 02:51:58, 00:12…
$ civil_twilight_end          <time> 00:26:02, 01:16:58, 00:47:21, 01:36:50, 03:14:07, 02:32:31, 01:18:23, 00:38:24, 07:41:03, 03:22:32, 00:42…
$ nautical_twilight_end       <time> 00:59:31, 01:51:17, 01:22:37, 02:13:08, 03:44:39, 03:11:02, 01:57:57, 01:17:59, 08:10:40, 04:03:02, 01:23…
$ astronomical_twilight_end   <time> 01:34:37, 02:27:37, 02:00:24, 02:52:31, 04:16:27, 03:54:00, 02:42:45, 02:02:47, 08:39:42, 04:49:28, 02:09…
colSums(is.na(ufo_sightings))
    reported_date_time reported_date_time_utc            posted_date                   city                  state           country_code 
                     0                      0                      0                      0                     85                      0 
                 shape      reported_duration       duration_seconds                summary             has_images               day_part 
                  2039                      0                      0                     31                      0                   2563 
colSums(is.na(places))
                city alternate_city_names                state              country         country_code             latitude 
                   0                 2953                   32                    0                    0                    0 
           longitude             timezone           population          elevation_m 
                   0                    0                    0                 2285 
colSums(is.na(day_parts_map))
                rounded_lat                rounded_long                rounded_date astronomical_twilight_begin     nautical_twilight_begin 
                          0                           0                           0                         951                         122 
       civil_twilight_begin                     sunrise                  solar_noon                      sunset          civil_twilight_end 
                          2                           2                           0                           2                           2 
      nautical_twilight_end   astronomical_twilight_end 
                        122                         951 

Summary of the data

dim(ufo_sightings)
[1] 96429    12
summary(ufo_sightings)
 reported_date_time               reported_date_time_utc            posted_date             city              state          
 Min.   :1925-12-29 00:00:00.00   Min.   :1925-12-29 00:00:00.00   Min.   :1998-03-07   Length:96429       Length:96429      
 1st Qu.:2004-10-01 05:10:00.00   1st Qu.:2004-10-01 05:10:00.00   1st Qu.:2006-10-30   Class :character   Class :character  
 Median :2012-02-05 03:00:00.00   Median :2012-02-05 03:00:00.00   Median :2012-08-19   Mode  :character   Mode  :character  
 Mean   :2009-04-30 02:41:30.98   Mean   :2009-04-30 02:41:30.98   Mean   :2011-09-26                                        
 3rd Qu.:2016-01-25 03:30:00.00   3rd Qu.:2016-01-25 03:30:00.00   3rd Qu.:2016-07-15                                        
 Max.   :2023-05-18 19:27:00.00   Max.   :2023-05-18 19:27:00.00   Max.   :2023-05-19                                        
 country_code          shape           reported_duration  duration_seconds      summary          has_images        day_part        
 Length:96429       Length:96429       Length:96429       Min.   :0.000e+00   Length:96429       Mode :logical   Length:96429      
 Class :character   Class :character   Class :character   1st Qu.:3.000e+01   Class :character   FALSE:96429     Class :character  
 Mode  :character   Mode  :character   Mode  :character   Median :1.800e+02   Mode  :character                   Mode  :character  
                                                          Mean   :3.161e+04                                                        
                                                          3rd Qu.:6.000e+02                                                        
                                                          Max.   :1.987e+09                                                        
dim(places)
[1] 14417    10
summary(places)
     city           alternate_city_names    state             country          country_code          latitude        longitude      
 Length:14417       Length:14417         Length:14417       Length:14417       Length:14417       Min.   :-53.15   Min.   :-170.48  
 Class :character   Class :character     Class :character   Class :character   Class :character   1st Qu.: 34.99   1st Qu.: -95.46  
 Mode  :character   Mode  :character     Mode  :character   Mode  :character   Mode  :character   Median : 40.09   Median : -84.21  
                                                                                                  Mean   : 37.76   Mean   : -75.36  
                                                                                                  3rd Qu.: 42.96   3rd Qu.: -74.82  
                                                                                                  Max.   : 70.64   Max.   : 179.19  
                                                                                                                                    
   timezone           population        elevation_m    
 Length:14417       Min.   :       0   Min.   : -57.0  
 Class :character   1st Qu.:    1926   1st Qu.:  65.0  
 Mode  :character   Median :    6085   Median : 194.0  
                    Mean   :   86375   Mean   : 288.2  
                    3rd Qu.:   21993   3rd Qu.: 304.0  
                    Max.   :22315474   Max.   :3097.0  
                                       NA's   :2285    
dim(day_parts_map)
[1] 26409    12
summary(day_parts_map)
  rounded_lat      rounded_long      rounded_date        astronomical_twilight_begin nautical_twilight_begin civil_twilight_begin
 Min.   :-50.00   Min.   :-170.00   Min.   :1925-12-27   Length:26409                Length:26409            Length:26409        
 1st Qu.: 30.00   1st Qu.:-110.00   1st Qu.:1999-01-17   Class1:hms                  Class1:hms              Class1:hms          
 Median : 40.00   Median : -90.00   Median :2007-03-25   Class2:difftime             Class2:difftime         Class2:difftime     
 Mean   : 36.23   Mean   : -80.01   Mean   :2004-03-18   Mode  :numeric              Mode  :numeric          Mode  :numeric      
 3rd Qu.: 40.00   3rd Qu.: -80.00   3rd Qu.:2014-10-12                                                                           
 Max.   : 70.00   Max.   : 180.00   Max.   :2023-05-21                                                                           
   sunrise          solar_noon          sunset         civil_twilight_end nautical_twilight_end astronomical_twilight_end
 Length:26409      Length:26409      Length:26409      Length:26409       Length:26409          Length:26409             
 Class1:hms        Class1:hms        Class1:hms        Class1:hms         Class1:hms            Class1:hms               
 Class2:difftime   Class2:difftime   Class2:difftime   Class2:difftime    Class2:difftime       Class2:difftime          
 Mode  :numeric    Mode  :numeric    Mode  :numeric    Mode  :numeric     Mode  :numeric        Mode  :numeric           
                                                                                                                         
                                                                                                                         
sum(duplicated(ufo_sightings))
[1] 3

Handle missing observations (fill in or remove them), correct errors, etc.

Due to the large size of the file on which the analysis is performed, only some of the data was used for visual representation.

library(tidyverse)  
library(here)       
library(withr)
library(naniar)

UFO sightings

# Plot the missingness pattern on a random sample of 1000 sightings.
vis_miss(
  dplyr::slice_sample(ufo_sightings, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)

Replacing missing data with most frequent entry and saving cleaned data

# Find the most frequent observed level of each categorical column.
# NA is dropped before counting so the fill value can never itself be NA.
most_common_day_part <- ufo_sightings %>%
  filter(!is.na(day_part)) %>%
  count(day_part, sort = TRUE) %>%
  slice(1) %>%
  pull(day_part)

most_common_shape <- ufo_sightings %>%
  filter(!is.na(shape)) %>%
  count(shape, sort = TRUE) %>%
  slice(1) %>%
  pull(shape)

# Fill missing values with the most frequent value.
# NOTE(review): the duplicate check on the raw data reported exact duplicate
# rows; they are not removed here -- confirm whether that is intended.
ufo_clean <- ufo_sightings %>%
  mutate(
    day_part = coalesce(day_part, most_common_day_part),
    shape = coalesce(shape, most_common_shape)
  )

# Save the cleaned table and re-check missingness on a random sample.
write_csv(ufo_clean, here("data", "2023", "2023-06-20", "ufo_clean.csv"))
ufo_clean %>%
  dplyr::slice_sample(n = 1000) %>%
  vis_miss(cluster = TRUE, sort_miss = TRUE)

Places

# Plot the missingness pattern on a random sample of 1000 places.
vis_miss(
  dplyr::slice_sample(places, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)

Replacing missing alternate city names with a blank placeholder (a single space), missing elevation with the median value, and saving cleaned data

# No better placeholder than " " was found for missing alternate city names.
# Missing elevations are filled with the median of the observed elevations.
median_elevation <- median(places$elevation_m, na.rm = TRUE)

places_clean <- places %>%
  mutate(
    alternate_city_names = replace_na(alternate_city_names, " "),
    elevation_m = replace_na(elevation_m, median_elevation)
  )

# Save the cleaned table and re-check missingness on a random sample.
write_csv(places_clean, here("data", "2023", "2023-06-20", "places_clean.csv"))
vis_miss(
  dplyr::slice_sample(places_clean, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)

Day parts map

# Plot the missingness pattern on a random sample of 1000 day-part rows.
vis_miss(
  dplyr::slice_sample(day_parts_map, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)

Replacing missing data with the median value and saving cleaned data

# Fill missing astronomical twilight times with the column median.
# coalesce() is used instead of ifelse() because ifelse() strips attributes,
# which would silently turn the time-of-day (hms) columns into plain numbers.
# as_hms() makes sure the median is a time-of-day value as well.
day_parts_clean <- day_parts_map %>%
  mutate(
    astronomical_twilight_begin = coalesce(
      astronomical_twilight_begin,
      hms::as_hms(median(astronomical_twilight_begin, na.rm = TRUE))
    ),
    astronomical_twilight_end = coalesce(
      astronomical_twilight_end,
      hms::as_hms(median(astronomical_twilight_end, na.rm = TRUE))
    )
  )

# Save the cleaned table and re-check missingness on a random sample.
write_csv(day_parts_clean, here("data", "2023", "2023-06-20", "day_parts_clean.csv"))
day_parts_clean %>%
  dplyr::slice_sample(n = 1000) %>%
  vis_miss(cluster = TRUE, sort_miss = TRUE)

Adjust the data format to meet the requirements of your analysis


# Keep only sightings that have a shape, a reported duration and a summary.
ufo_model_data <- ufo_clean %>%
  drop_na(shape, reported_duration, summary)

# Keep only places that have an alternate-names entry.
places_model_data <- places_clean %>%
  drop_na(alternate_city_names)

# Inspect the resulting structures.
ufo_model_data %>% glimpse()
places_model_data %>% glimpse()
day_parts_clean %>% glimpse()

# Persist the analysis-ready tables.
model_data_dir <- here("data", "2023", "2023-06-20")
write_csv(ufo_model_data, file.path(model_data_dir, "ufo_model_data.csv"))
write_csv(places_model_data, file.path(model_data_dir, "places_model_data.csv"))

Add new columns to each dataframe

library(lubridate)
library(dplyr)
library(hms)

Dołączanie pakietu: ‘hms’

Następujący obiekt został zakryty z ‘package:lubridate’:

    hms

Adding new columns to “sightings” dataframe

# Derive calendar, location and reporting-delay columns for each sighting.
ufo_model_data_mutated <- ufo_model_data %>%
  mutate(
    year = year(reported_date_time),
    month = month(reported_date_time),
    # abbr = FALSE yields full day names ("Monday", ..., "Sunday").
    weekday = wday(reported_date_time, label = TRUE, abbr = FALSE, locale = "C"),
    # Must be compared against the full names; the abbreviations "Sat"/"Sun"
    # never match the labels above and would make this column always FALSE.
    is_weekend = weekday %in% c("Saturday", "Sunday"),
    country_upper = toupper(country_code),
    report_hour = hour(reported_date_time),
    city_state = paste(city, state, sep = ", "),
    # Whole days between the sighting date and the date the report was posted.
    report_delay_days = as.numeric(difftime(posted_date, as.Date(reported_date_time), units = "days"))
  )

ufo_model_data_mutated

Adding new columns to “places” dataframe

# Derive location, population and elevation descriptors for each place.
places_model_data_mutated <- places_model_data %>%
  mutate(
    city_state = paste(city, state, sep = ", "),
    is_us = country_code == "US",
    population_log = log1p(population),
    hemisphere = ifelse(latitude >= 0, "Northern", "Southern"),
    # NOTE(review): this flag is derived from longitude alone, not from any
    # coastline data -- confirm it is an acceptable proxy for "coastal".
    is_coastal = abs(longitude) < 80 | abs(longitude) > 120,
    pop_category = case_when(
      population < 10000 ~ "small",
      population < 100000 ~ "medium",
      TRUE ~ "large"
    ),
    elevation_category = case_when(
      is.na(elevation_m) ~ "unknown",
      elevation_m < 100 ~ "low",
      elevation_m < 500 ~ "medium",
      TRUE ~ "high"
    ),
    name_length = nchar(city),
    # Second "/"-separated component of the timezone string. vapply() is used
    # instead of sapply() so the result is always a character vector.
    timezone_area = vapply(strsplit(timezone, "/"), `[`, character(1), 2)
  )

places_model_data_mutated

Adding new columns to “day parts” dataframe

# Derive daylight/twilight durations and clock-based columns.
# On the clock used in the data an "end" time can fall after midnight and so
# be numerically smaller than its "begin" time (e.g. sunrise 11:16:15 with
# sunset 00:49:55). Differences are therefore taken modulo one day (86400 s)
# so that durations are never negative.
day_parts_model_mutated <- day_parts_clean %>%
  mutate(
    daylight_duration = as.numeric(sunset - sunrise, units = "secs") %% 86400,
    is_northern_hemisphere = rounded_lat >= 0,
    sunrise_hour = hour(sunrise),
    sunset_hour = hour(sunset),
    is_day_short = daylight_duration < 36000, # less than 10 h
    # NOTE(review): this is the span from astronomical twilight begin to
    # astronomical twilight end, not the length of one twilight period.
    twilight_duration = as.numeric(
      astronomical_twilight_end - astronomical_twilight_begin,
      units = "secs"
    ) %% 86400,
    is_long_twilight = twilight_duration > 5400, # 1.5 h
    sunrise_minutes = hour(sunrise) * 60 + minute(sunrise),
    solar_noon_minutes = hour(solar_noon) * 60 + minute(solar_noon),
    sunset_minutes = hour(sunset) * 60 + minute(sunset)
  )

day_parts_model_mutated
library(ggplot2)
library(dplyr)
library(sf)
Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
library(rnaturalearthdata)

Dołączanie pakietu: ‘rnaturalearthdata’

Następujący obiekt został zakryty z ‘package:rnaturalearth’:

    countries110

Explore data with charts

library(ggplot2)
library(dplyr)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)

Number of sightings per day

# Line chart of the number of sightings on each calendar date.
ufo_model_data_mutated %>%
  mutate(date = as.Date(reported_date_time)) %>%
  count(date) %>%
  ggplot(mapping = aes(x = date, y = n)) +
  geom_line(color = "steelblue") +
  labs(
    title = "Number of sightings per day",
    x = "Date",
    y = "Number of sightings"
  )

Interpretation:

The chart shows daily UFO sightings over time. Sightings were rare before 1960, gradually increased through the 1990s, and peaked between 2000 and 2015. After 2015, the number of reports declined sharply. This suggests that UFO sightings may be influenced by media, public interest, or reporting practices.

Annual Trend of UFO Sightings

# Yearly totals with a loess smoother overlaid to show the long-run trend.
ufo_model_data_mutated %>%
  count(year = lubridate::year(reported_date_time)) %>%
  ggplot(mapping = aes(x = year, y = n)) +
  geom_line(color = "darkblue") +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  labs(title = "Annual Trend of UFO Sightings", x = "Year", y = "Number of Sightings") +
  theme_minimal()

Interpretation:

The chart shows a clear rise in UFO sightings from the 1980s to around 2012, with a peak near 2014. After that, there’s a sharp decline in reports. The red loess curve highlights a long-term upward trend followed by a recent downward shift. This may reflect changes in reporting behavior, public interest, or data availability over time.

Number of sightings depending on the day of the week

# Bar chart of total sightings for each day of the week.
ufo_model_data_mutated %>%
  count(weekday) %>%
  ggplot(mapping = aes(x = weekday, y = n)) +
  geom_col(fill = "orange") +
  labs(
    title = "Sightings depending on the day of the week",
    x = "Day of the week",
    y = "Number of sightings"
  )

Interpretation:

The number of UFO sightings varies by day of the week. The highest counts occur on Saturdays and Sundays, while Tuesdays have the fewest reports. This suggests people are more likely to notice and report sightings during weekends, possibly due to having more free time or being outdoors more often.

Hourly distribution of sightings

# Bar chart of total sightings for each hour of the day.
ufo_model_data_mutated %>%
  count(hour = hour(reported_date_time)) %>%
  ggplot(mapping = aes(x = hour, y = n)) +
  geom_col(fill = "purple") +
  labs(
    title = "Hourly distribution of sightings",
    x = "Hour of the day",
    y = "Number of sightings"
  )

Interpretation:

UFO sightings are most frequently reported between 8 PM and 3 AM, peaking around 2 AM. Sightings are least common during midday hours. This pattern suggests that sightings are more likely to occur—or at least be noticed and reported—at night, when the sky is dark and unusual lights are more visible.

Heatmap: day of the week vs hour of the day

# Heatmap of sighting counts by weekday (rows) and hour of day (columns).
ufo_model_data_mutated %>%
  mutate(
    hour = hour(reported_date_time),
    # `weekday` holds full day names, so the Monday-first ordering must use
    # full names too; abbreviations such as "Mon" are not levels of the factor
    # and would leave the order unchanged.
    weekday = fct_relevel(
      weekday,
      c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    )
  ) %>%
  count(weekday, hour) %>%
  ggplot(aes(x = hour, y = weekday, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c() +
  labs(
    title = "Heatmap: day of the week vs hour of the day",
    x = "Hour of the day",
    y = "Day of the week",
    fill = "Number of sightings"
  )

Interpretation:

The heatmap shows the distribution of UFO sightings by hour of the day and day of the week. Most sightings occur after midnight on Sunday, peaking between 1–3 AM. Other late-night hours, especially on weekends, also show elevated counts. This pattern reinforces that sightings are more frequent during late-night weekend hours, when people are likely to be awake and outdoors in dark conditions.

Sightings with images vs no image

# Pie chart of sightings with and without an attached image.
ufo_model_data_mutated %>%
  mutate(image_status = ifelse(has_images, "Has an image", "Has no image")) %>%
  count(image_status) %>%
  ggplot(aes(x = "", y = n, fill = image_status)) +
  geom_col(width = 1) +
  # Polar coordinates turn the single stacked bar into a pie.
  coord_polar(theta = "y") +
  # Title typo fixed ("Sightins" -> "Sightings").
  labs(title = "Sightings with images vs no image", fill = "Image existence") +
  theme_void() +
  scale_fill_manual(values = c("Has an image" = "#66BB6A", "Has no image" = "#EF5350"))

Interpretation:

The chart shows that almost all UFO sightings lack images. Sightings with images are extremely rare, suggesting that reports are usually text-based or anecdotal. This indicates a strong reliance on witness testimony rather than visual evidence in the dataset.

Verifying that the pie chart above is correct

# Number of sightings flagged as having an image (missing flags ignored).
ufo_model_data_mutated %>%
  pull(has_images) %>%
  sum(na.rm = TRUE)

Number of sightings per shape

# Horizontal bar chart of sighting counts per shape, ordered by count.
ufo_model_data_mutated %>%
  count(shape) %>%
  ggplot(mapping = aes(x = reorder(shape, n), y = n)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Number of sightings per shape",
    x = "Shape",
    y = "Number of sightings"
  )

Interpretation:

The most commonly reported UFO shapes are light, circle, and triangle. Unusual shapes like cube, star, and cross are very rare. This suggests that most sightings describe simple or glowing forms, possibly influenced by visibility, perception, or common cultural imagery.

Number of sightings per country

# Bar chart of sighting counts for countries with at least 100 reports.
ufo_model_data_mutated %>%
  count(country_code) %>%
  filter(n >= 100) %>%
  ggplot(mapping = aes(x = reorder(country_code, n), y = n)) +
  geom_col(fill = "skyblue") +
  labs(title = "Number of sightings per country", x = "Country code", y = "Number of sightings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Interpretation:

The vast majority of UFO sightings come from the United States, with over 80,000 reports. Other countries like Canada and Great Britain have significantly fewer sightings. This suggests that the dataset is strongly US-centric, possibly due to better reporting infrastructure, public interest, or data source bias.

Density map of sightings

# World map of the places table: one point per city, sized by population and
# coloured by population category.
world <- ne_countries(scale = "medium", returnclass = "sf")

places_clean %>%
  filter(!is.na(latitude), !is.na(longitude)) %>%
  mutate(
    pop_category = case_when(
      population < 10000  ~ "small",
      population < 100000 ~ "medium",
      TRUE                ~ "large"
    )
  ) %>%
  ggplot() +
  geom_sf(data = world, fill = "lightgray", color = "black") +  # add the base map
  geom_point(aes(
    x = longitude, y = latitude,
    size = population, color = pop_category
  ), alpha = 0.6) +
  scale_size(range = c(1, 6), guide = "none") +
  labs(
    title = "Cities with UFO sightings",
    subtitle = "Point size ~ population, color ~ population category",
    # x is mapped to longitude and y to latitude, so label them accordingly.
    x = "Longitude",
    y = "Latitude",
    color = "Population category"
  ) +
  theme_minimal()

Interpretation:

Sightings are most densely clustered in North America and Europe, especially in large urban areas. This suggests that population density and infrastructure may influence reporting frequency. Other regions show fewer reports, which could reflect lower reporting access or less data availability.

UFO Sightings by City Population Category

# Bar chart of sighting counts by the population category of the city.
# The lookup is reduced to one row per (city, state, country_code) and joined
# on those three keys. Joining on the non-unique `city_state` string produced
# a many-to-many match that duplicated sightings and inflated the counts.
ufo_model_data_mutated %>%
  left_join(
    places_model_data_mutated %>%
      distinct(city, state, country_code, .keep_all = TRUE) %>%
      select(city, state, country_code, pop_category),
    by = c("city", "state", "country_code"),
    relationship = "many-to-one"
  ) %>%
  count(pop_category) %>%
  ggplot(aes(x = pop_category, y = n, fill = pop_category)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "UFO Sightings by City Population Category",
    x = "Population Category",
    y = "Number of Sightings"
  ) +
  scale_fill_manual(values = c("small" = "#91bfdb", "medium" = "#fdae61", "large" = "#d73027")) +
  theme_minimal()
Ostrzeżenie: Detected an unexpected many-to-many relationship between `x` and `y`.

Interpretation:

The chart shows that most UFO sightings come from medium-sized cities, followed by large cities, with small towns reporting the fewest. This suggests that mid-sized urban areas may offer a balance of visibility, outdoor activity, and public engagement conducive to sightings. It also reflects where people live and are most likely to report unusual events.

---
title: "Exploratory Data Analysis with UFO sightings dataset"
output: html_notebook
---

# Load the data and check its structure (variables, missing values, data types)

```{r}
library(tidyverse)  
library(here)       
library(withr)      
```

```{r}
# Download the three tables directly from the TidyTuesday GitHub repository.
ufo_sightings <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/ufo_sightings.csv'
)
places <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/places.csv'
)
day_parts_map <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/day_parts_map.csv'
)

```

Checking if files have been loaded correctly

```{r}
# Preview the top rows of each table to confirm the downloads parsed.
ufo_sightings %>% head()
places %>% head()
day_parts_map %>% head()
```

Saving data in respective files

```{r}
# Make sure the local data folder exists, then store a raw copy of each
# downloaded table in it.
raw_data_dir <- here("data", "2023", "2023-06-20")
dir.create(raw_data_dir, recursive = TRUE, showWarnings = FALSE)

write_csv(ufo_sightings, file.path(raw_data_dir, "ufo_sightings.csv"))
write_csv(places, file.path(raw_data_dir, "places.csv"))
write_csv(day_parts_map, file.path(raw_data_dir, "day_parts_map.csv"))
```

Checking missing data

```{r}
# Show column names, types and leading values of each table.
ufo_sightings %>% glimpse()
places %>% glimpse()
day_parts_map %>% glimpse()
```

```{r}
# Count missing values per column of the sightings table.
ufo_sightings %>%
  is.na() %>%
  colSums()
```

```{r}
# Count missing values per column of the places table.
places %>%
  is.na() %>%
  colSums()
```

```{r}
# Count missing values per column of the day-parts table.
day_parts_map %>%
  is.na() %>%
  colSums()
```

Summary of the data

```{r}
# Dimensions and per-column summary of the sightings table.
ufo_sightings %>% dim()
ufo_sightings %>% summary()
```

```{r}
# Dimensions and per-column summary of the places table.
places %>% dim()
places %>% summary()
```

```{r}
# Dimensions and per-column summary of the day-parts table.
day_parts_map %>% dim()
day_parts_map %>% summary()
```

```{r}
# Number of sightings rows that exactly repeat an earlier row.
ufo_sightings %>%
  duplicated() %>%
  sum()
```

# Handle missing observations (fill in or remove them), correct errors, etc.

Due to the large size of the file on which the analysis is performed, only some of the data was used for visual representation.

```{r}
library(tidyverse)  
library(here)       
library(withr)
library(naniar)
```

## UFO sightings

```{r}
# Plot the missingness pattern on a random sample of 1000 sightings.
vis_miss(
  dplyr::slice_sample(ufo_sightings, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)
```

Replacing missing data with most frequent entry and saving cleaned data

```{r}
# Find the most frequent observed level of each categorical column.
# NA is dropped before counting so the fill value can never itself be NA.
most_common_day_part <- ufo_sightings %>%
  filter(!is.na(day_part)) %>%
  count(day_part, sort = TRUE) %>%
  slice(1) %>%
  pull(day_part)

most_common_shape <- ufo_sightings %>%
  filter(!is.na(shape)) %>%
  count(shape, sort = TRUE) %>%
  slice(1) %>%
  pull(shape)

# Fill missing values with the most frequent value.
# NOTE(review): the duplicate check on the raw data reported exact duplicate
# rows; they are not removed here -- confirm whether that is intended.
ufo_clean <- ufo_sightings %>%
  mutate(
    day_part = coalesce(day_part, most_common_day_part),
    shape = coalesce(shape, most_common_shape)
  )

write_csv(ufo_clean, here("data", "2023", "2023-06-20", "ufo_clean.csv"))

```

```{r}
# Re-check missingness on a random sample of 1000 cleaned sightings.
vis_miss(
  dplyr::slice_sample(ufo_clean, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)
```

## Places

```{r}
# Plot the missingness pattern on a random sample of 1000 places.
vis_miss(
  dplyr::slice_sample(places, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)
```

Replacing missing alternate city names with a blank placeholder (a single space), missing elevation with the median value, and saving cleaned data

```{r}
# Missing alternate city names become a single space; missing elevations are
# filled with the median of the observed elevations.
median_elevation <- median(places$elevation_m, na.rm = TRUE)

places_clean <- places %>%
  mutate(
    alternate_city_names = replace_na(alternate_city_names, " "),
    elevation_m = replace_na(elevation_m, median_elevation)
  )

write_csv(places_clean, here("data", "2023", "2023-06-20", "places_clean.csv"))
```

```{r}
# Missingness overview of the cleaned places table, drawn from a random
# sample of 1,000 rows instead of the full table.
vis_miss(
  dplyr::slice_sample(places_clean, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)
```

## Day parts map

```{r}
# Missingness overview of the raw day-parts table, drawn from a random
# sample of 1,000 rows instead of the full table.
vis_miss(
  dplyr::slice_sample(day_parts_map, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)
```

Replacing missing data with the median value and saving the cleaned data

```{r}
# Fill missing astronomical-twilight times with the column median.
#
# The gaps are filled by sub-assignment (replace()) rather than ifelse():
# ifelse() returns a bare vector without the attributes of its inputs, which
# would demote a classed column to plain numbers, whereas replace() keeps
# the column's own class.
# NOTE(review): presumably these columns are time-of-day (hms) values, like
# the sunrise/sunset columns that are later passed to hour() -- confirm
# against the code that loads day_parts_map.
day_parts_clean <- day_parts_map %>%
  mutate(
    across(
      c(astronomical_twilight_begin, astronomical_twilight_end),
      ~ replace(.x, is.na(.x), median(.x, na.rm = TRUE))
    )
  )

# Save the cleaned table; here() resolves the path from the project root.
write_csv(day_parts_clean, here("data", "2023", "2023-06-20", "day_parts_clean.csv"))
```

```{r}
# Missingness overview of the cleaned day-parts table, drawn from a random
# sample of 1,000 rows instead of the full table.
vis_miss(
  dplyr::slice_sample(day_parts_clean, n = 1000),
  cluster = TRUE,
  sort_miss = TRUE
)
```

# Adjust the data format to meet the requirements of your analysis

```{r}

# Keep only sightings that have a shape, a reported duration and a summary.
ufo_model_data <- ufo_clean %>%
  drop_na(shape, reported_duration, summary)

# Keep only places whose alternate-name field is not missing.
places_model_data <- places_clean %>%
  drop_na(alternate_city_names)

```

```{r}
# Column names, types and first values of the three analysis tables.
ufo_model_data %>% glimpse()
places_model_data %>% glimpse()
day_parts_clean %>% glimpse()
```

```{r}
# Save both filtered tables; here() resolves the paths from the project root.
ufo_model_data %>%
  write_csv(here("data", "2023", "2023-06-20", "ufo_model_data.csv"))

places_model_data %>%
  write_csv(here("data", "2023", "2023-06-20", "places_model_data.csv"))
```

# Add new columns to each dataframe

```{r}
library(lubridate)
library(dplyr)
library(hms)
```

Adding new columns to "sightings" dataframe

```{r}
# Derive calendar, location and reporting-delay features for each sighting.
ufo_model_data_mutated <- ufo_model_data %>%
  mutate(
    year = year(reported_date_time),
    month = month(reported_date_time),
    # Full English day names ("Monday", ...): abbr = FALSE requests the
    # unabbreviated labels and locale = "C" fixes their language.
    weekday = wday(
      reported_date_time,
      label = TRUE, abbr = FALSE, locale = "C"
    ),
    # Must be compared against the full names produced above; the
    # abbreviations "Sat"/"Sun" never match and would flag every row FALSE.
    is_weekend = weekday %in% c("Saturday", "Sunday"),
    country_upper = toupper(country_code),
    report_hour = hour(reported_date_time),
    city_state = paste(city, state, sep = ", "),
    # Whole-day gap between the calendar date of the sighting time stamp and
    # the date the report was posted.
    report_delay_days = as.numeric(
      difftime(posted_date, as.Date(reported_date_time), units = "days")
    )
  )

ufo_model_data_mutated
```

-   `year`: Extracts the year from the `reported_date_time`.
-   `month`: Extracts the month (1–12) from the report time-stamp.
-   `weekday`: Returns the weekday name from the date.
-   `is_weekend`: Logical column: `TRUE` if the day is Saturday or Sunday, `FALSE` otherwise.
-   `country_upper`: Converts the `country_code` to uppercase.
-   `report_hour`: Extracts the hour (0–23) from the report time-stamp.
-   `city_state`: Concatenates `city` and `state` into a single string.
-   `report_delay_days`: Calculates the delay in days between when the event was reported and when it was posted.

Adding new columns to "places" dataframe

```{r}
# Derive location, population and elevation features for each place.
places_model_data_mutated <- places_model_data %>%
  mutate(
    city_state = paste(city, state, sep = ", "),
    is_us = country_code == "US",
    population_log = log1p(population),
    hemisphere = ifelse(latitude >= 0, "Northern", "Southern"),
    # Rough coastal proxy based on longitude only.
    is_coastal = abs(longitude) < 80 | abs(longitude) > 120,
    pop_category = case_when(
      # A missing population must not fall through to the catch-all branch
      # and be labelled "large".
      is.na(population) ~ NA_character_,
      population < 10000 ~ "small",
      population < 100000 ~ "medium",
      TRUE ~ "large"
    ),
    elevation_category = case_when(
      is.na(elevation_m) ~ "unknown",
      elevation_m < 100 ~ "low",
      elevation_m < 500 ~ "medium",
      TRUE ~ "high"
    ),
    name_length = nchar(city),
    # Second "/"-separated component of the timezone name (NA when there is
    # none). vapply() guarantees a character vector with one entry per row,
    # whereas the return type of sapply() depends on its input.
    timezone_area = vapply(strsplit(timezone, "/"), `[`, character(1), 2)
  )

places_model_data_mutated
```

-   `city_state`: Combines `city` and `state` into a single string.
-   `is_us`: Logical value: `TRUE` if the location is in the United States else `FALSE`.
-   `population_log`: Log-transformed population.
-   `hemisphere`: `"Northern"` if latitude is ≥ 0, `"Southern"` otherwise.
-   `is_coastal`: Logical: `TRUE` if longitude is outside the range [80, 120] in absolute value — a rough coastal proxy.
-   `pop_category`: Categorizes places based on population: `"small"`, `"medium"`, or `"large"`.
-   `elevation_category`: Classifies elevation: `"low"` (\<100 m), `"medium"` (\<500 m), `"high"` (≥500 m), or `"unknown"` if NA.
-   `name_length`: The number of characters in the city name.
-   `timezone_area`: Extracts the second part of the timezone string.

Adding new columns to "day parts" dataframe

```{r}
# Durations (in seconds), clock-time features and threshold flags for every
# row of the cleaned day-parts table.
day_parts_model_mutated <- day_parts_clean %>%
  mutate(
    daylight_duration = as.numeric(sunset - sunrise, units = "secs"),
    is_northern_hemisphere = rounded_lat >= 0,
    sunrise_hour = hour(sunrise),
    sunset_hour = hour(sunset),
    is_day_short = daylight_duration < 10 * 3600, # less than 10 h
    twilight_duration = as.numeric(
      astronomical_twilight_end - astronomical_twilight_begin,
      units = "secs"
    ),
    is_long_twilight = twilight_duration > 1.5 * 3600, # more than 1.5 h
    # Clock times expressed as minutes since midnight.
    sunrise_minutes = 60 * hour(sunrise) + minute(sunrise),
    solar_noon_minutes = 60 * hour(solar_noon) + minute(solar_noon),
    sunset_minutes = 60 * hour(sunset) + minute(sunset)
  )

day_parts_model_mutated
```

-   `daylight_duration`: The length of the day in seconds — difference between `sunset` and `sunrise`.
-   `is_northern_hemisphere`: Logical: `TRUE` if the location is in the Northern Hemisphere.
-   `sunrise_hour`: The hour (0–23) when the sun rises.
-   `sunset_hour`: The hour (0–23) when the sun sets.
-   `is_day_short`: Logical: `TRUE` if the day is shorter than 10 hours
-   `twilight_duration`: Duration of astronomical twilight in seconds — time between `astronomical_twilight_begin` and `end`.
-   `is_long_twilight`: Logical: `TRUE` if twilight duration is longer than 1.5 hours
-   `sunrise_minutes`: Sunrise time in total minutes from midnight.
-   `solar_noon_minutes`: Solar noon time in minutes from midnight.
-   `sunset_minutes`: Sunset time in minutes from midnight.

```{r}
# Inspect the three tables *after* the new columns were added. The day-parts
# table to look at is day_parts_model_mutated; day_parts_clean is the input
# that does not contain the derived columns.
glimpse(ufo_model_data_mutated)
glimpse(places_model_data_mutated)
glimpse(day_parts_model_mutated)
```

# Explore data with charts

```{r}
library(ggplot2)
library(dplyr)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
```

## Number of sightings per day

```{r}
# Line chart of the number of sightings on each calendar date.
ufo_model_data_mutated %>%
  mutate(date = as.Date(reported_date_time)) %>%
  count(date) %>%
  ggplot(aes(date, n)) +
  geom_line(colour = "steelblue") +
  labs(
    title = "Number of sightings per day",
    x = "Date",
    y = "Number of sightings"
  )
```

**Interpretation:**

The chart shows daily UFO sightings over time. Sightings were rare before 1960, gradually increased through the 1990s, and peaked between 2000 and 2015. After 2015, the number of reports declined sharply. This suggests that UFO sightings may be influenced by media, public interest, or reporting practices.

## Annual Trend of UFO Sightings

```{r}
# Yearly number of sightings (blue line) with a loess trend curve (red).
ufo_model_data_mutated %>%
  count(year = lubridate::year(reported_date_time)) %>%
  ggplot(aes(year, n)) +
  geom_line(colour = "darkblue") +
  geom_smooth(method = "loess", se = FALSE, colour = "red") +
  labs(
    title = "Annual Trend of UFO Sightings",
    x = "Year",
    y = "Number of Sightings"
  ) +
  theme_minimal()

```

**Interpretation:**

The chart shows a clear rise in UFO sightings from the 1980s to around 2012, with a peak near 2014. After that, there’s a sharp decline in reports. The red loess curve highlights a long-term upward trend followed by a recent downward shift. This may reflect changes in reporting behavior, public interest, or data availability over time.

## Number of sightings depending on the day of the week

```{r}
# Bar chart of the number of sightings on each day of the week.
ggplot(count(ufo_model_data_mutated, weekday), aes(weekday, n)) +
  geom_col(fill = "orange") +
  labs(
    title = "Sightings depending on the day of the week",
    x = "Day of the week",
    y = "Number of sightings"
  )
```

**Interpretation:**

The number of UFO sightings varies by day of the week. The highest counts occur on Saturdays and Sundays, while Tuesdays have the fewest reports. This suggests people are more likely to notice and report sightings during weekends, possibly due to having more free time or being outdoors more often.

## Hourly distribution of sightings

```{r}
# Bar chart of the number of sightings in each hour of the day.
ufo_model_data_mutated %>%
  count(hour = hour(reported_date_time)) %>%
  ggplot(aes(hour, n)) +
  geom_col(fill = "purple") +
  labs(
    title = "Hourly distribution of sightings",
    x = "Hour of the day",
    y = "Number of sightings"
  )
```

**Interpretation:**

UFO sightings are most frequently reported between 8 PM and 3 AM, peaking around 2 AM. Sightings are least common during midday hours. This pattern suggests that sightings are more likely to occur—or at least be noticed and reported—at night, when the sky is dark and unusual lights are more visible.

## Heatmap: day of the week vs hour of the day

```{r}
# Heatmap of the number of sightings per weekday and hour of the day.
ufo_model_data_mutated %>%
  mutate(
    hour = hour(reported_date_time),
    # Put the week in Monday-to-Sunday order. `weekday` stores full day
    # names, so the levels must be spelled out in full: abbreviated names are
    # unknown levels and would leave the order unchanged (with a warning).
    weekday = fct_relevel(
      weekday,
      c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    )
  ) %>%
  count(weekday, hour) %>%
  ggplot(aes(x = hour, y = weekday, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c() +
  labs(
    title = "Heatmap: day of the week vs hour of the day",
    x = "Hour of the day",
    y = "Day of the week",
    fill = "Number of sightings"
  )

```

**Interpretation:**

The heatmap shows the distribution of UFO sightings by hour of the day and day of the week. Most sightings occur after midnight on Sunday, peaking between 1–3 AM. Other late-night hours, especially on weekends, also show elevated counts. This pattern reinforces that sightings are more frequent during late-night weekend hours, when people are likely to be awake and outdoors in dark conditions.

## Sightings with images vs no image

```{r}
# Pie chart of the share of sightings with and without an image. The plot
# title previously misspelled "Sightings".
ufo_model_data_mutated %>%
  mutate(image_status = ifelse(has_images, "Has an image", "Has no image")) %>%
  count(image_status) %>%
  ggplot(aes(x = "", y = n, fill = image_status)) +
  geom_col(width = 1) +
  # Polar coordinates on the y axis turn the single stacked bar into a pie.
  coord_polar(theta = "y") +
  labs(title = "Sightings with images vs no image", fill = "Image existence") +
  theme_void() +
  scale_fill_manual(
    values = c("Has an image" = "#66BB6A", "Has no image" = "#EF5350")
  )
```

**Interpretation:**

The chart shows that almost all UFO sightings lack images. Sightings with images are extremely rare, suggesting that reports are usually text-based or anecdotal. This indicates a strong reliance on witness testimony rather than visual evidence in the dataset.

Checking that the pie chart above is correct

```{r}
# Number of sightings flagged as having an image (has_images is logical, so
# summing it counts the TRUE entries; missing flags are ignored).
with(ufo_model_data_mutated, sum(has_images, na.rm = TRUE))
```

## Number of sightings per shape

```{r}
# Horizontal bar chart of sightings per reported shape, ordered by count.
ufo_model_data_mutated %>%
  count(shape) %>%
  mutate(shape = reorder(shape, n)) %>%
  ggplot(aes(shape, n)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Number of sightings per shape",
    x = "Shape",
    y = "Number of sightings"
  )
```

**Interpretation:**

The most commonly reported UFO shapes are light, circle, and triangle. Unusual shapes like cube, star, and cross are very rare. This suggests that most sightings describe simple or glowing forms, possibly influenced by visibility, perception, or common cultural imagery.

## Number of sightings per country

```{r}
# Bar chart of sightings per country, limited to countries with at least
# 100 sightings and ordered by count.
ufo_model_data_mutated %>%
  count(country_code) %>%
  filter(n >= 100) %>%
  mutate(country_code = reorder(country_code, n)) %>%
  ggplot(aes(country_code, n)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Number of sightings per country",
    x = "Country code",
    y = "Number of sightings"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

**Interpretation:**

The vast majority of UFO sightings come from the United States, with over 80,000 reports. Other countries like Canada and Great Britain have significantly fewer sightings. This suggests that the dataset is strongly US-centric, possibly due to better reporting infrastructure, public interest, or data source bias.

## Density map of sightings

```{r}
# World country outlines used as the base layer of the map.
world <- ne_countries(scale = "medium", returnclass = "sf")

# Map of the places in the data set: one point per place with coordinates,
# sized by population and coloured by population category.
places_clean %>%
  filter(!is.na(latitude), !is.na(longitude)) %>%
  mutate(
    pop_category = case_when(
      population < 10000  ~ "small",
      population < 100000 ~ "medium",
      TRUE                ~ "large"
    )
  ) %>%
  ggplot() +
  geom_sf(data = world, fill = "lightgray", color = "black") + # base map
  geom_point(aes(
    x = longitude, y = latitude,
    size = population, color = pop_category
  ), alpha = 0.6) +
  scale_size(range = c(1, 6), guide = "none") +
  labs(
    title = "Cities with UFO sightings",
    subtitle = "Point size ~ population, color ~ population category",
    # x carries longitude and y carries latitude (see geom_point above); the
    # axis titles previously read "Latitude" / "Altitude".
    x = "Longitude",
    y = "Latitude",
    color = "Population category"
  ) +
  theme_minimal()
```

**Interpretation:**

Sightings are most densely clustered in North America and Europe, especially in large urban areas. This suggests that population density and infrastructure may influence reporting frequency. Other regions show fewer reports, which could reflect lower reporting access or less data availability.

## UFO Sightings by City Population Category

```{r}
# Number of sightings per population category of the city they occurred in.
#
# Sightings are matched to places on city, state AND country code. Matching
# on the pasted "city, state" string alone ignores the country, so a sighting
# could pick up a same-named place elsewhere or match several places and be
# counted more than once. The lookup is first reduced to one row per key and
# relationship = "many-to-one" makes the join fail loudly if a sighting could
# ever match more than one place. Sightings without a matching place keep an
# NA category.
ufo_model_data_mutated %>%
  left_join(
    places_model_data_mutated %>%
      distinct(city, state, country_code, .keep_all = TRUE) %>%
      select(city, state, country_code, pop_category),
    by = c("city", "state", "country_code"),
    relationship = "many-to-one"
  ) %>%
  count(pop_category) %>%
  ggplot(aes(x = pop_category, y = n, fill = pop_category)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "UFO Sightings by City Population Category",
    x = "Population Category",
    y = "Number of Sightings"
  ) +
  scale_fill_manual(
    values = c("small" = "#91bfdb", "medium" = "#fdae61", "large" = "#d73027")
  ) +
  theme_minimal()

```

**Interpretation:**

The chart shows that most UFO sightings come from medium-sized cities, followed by large cities, with small towns reporting the fewest. This suggests that mid-sized urban areas may offer a balance of visibility, outdoor activity, and public engagement conducive to sightings. It also reflects where people live and are most likely to report unusual events.
